Source code for nlp_architect.utils.mrc_utils

# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************

import numpy as np


def max_values_squad(data_train):
    """
    Compute the longest paragraph and the longest question in a dataset.

    Args:
        data_train: iterable of examples; element 0 of each example is the
            paragraph token list and element 1 is the question token list.

    Returns:
        Tuple ``(max_paragraph_length, max_question_length)``. An empty
        dataset yields ``(0, 0)``.
    """
    max_premise = 0
    max_hypothesis = 0

    # Lengths are never negative, so starting from 0 gives the same answer as
    # seeding from the first example while also tolerating empty input.
    for ele in data_train:
        max_premise = max(max_premise, len(ele[0]))
        max_hypothesis = max(max_hypothesis, len(ele[1]))

    return max_premise, max_hypothesis
def get_qids(args, q_id_path, data_dev):
    """
    Build the list of question ids for the dev examples that are kept.

    Args:
        args: object exposing ``max_para``; only examples whose paragraph is
            strictly shorter than this value keep their question id.
        q_id_path: path to the question-ids file, one id per line.
        data_dev: development set; element 0 of each example is the paragraph.

    Returns:
        List of question ids (all spaces removed) for the retained examples,
        in dataset order.

    Raises:
        IndexError: if a retained example has no corresponding line in the
            question-ids file.
    """
    with open(q_id_path, encoding="utf-8") as q_ids:
        qids_list = [line.strip().replace(" ", "") for line in q_ids]

    # The i-th id line is taken to belong to the i-th dev example, so the
    # index advances for every example, including the ones filtered out.
    return [
        qids_list[idx]
        for idx, ele in enumerate(data_dev)
        if len(ele[0]) < args.max_para
    ]
def create_squad_training(paras_file, ques_file, answer_file, data_train_len=None):
    """
    Read the preprocessed files and assemble the dataset as a list.

    Each file holds one example per line, written as whitespace-separated
    integers.

    Args:
        paras_file: file name of the preprocessed paragraphs.
        ques_file: file name of the preprocessed questions.
        answer_file: file name of the preprocessed answer spans.
        data_train_len: number of examples to use; defaults to the number of
            lines in ``paras_file``.

    Returns:
        List of ``[paragraph, question, answer]`` entries, each component
        being a list of ints.

    Raises:
        IndexError: if any file has fewer than ``data_train_len`` lines.
        ValueError: if a token cannot be parsed as an int.
    """
    para_list = _read_int_rows(paras_file)
    ques_list = _read_int_rows(ques_file)
    ans_list = _read_int_rows(answer_file)

    if data_train_len is None:
        data_train_len = len(para_list)

    # Index explicitly (rather than zip) so a short or mismatched file still
    # fails loudly instead of silently truncating the dataset.
    return [
        [para_list[idx], ques_list[idx], ans_list[idx]]
        for idx in range(data_train_len)
    ]


def _read_int_rows(file_name):
    """Parse every line of ``file_name`` into a list of ints."""
    with open(file_name, encoding="utf-8") as handle:
        return [[int(token) for token in line.split()] for line in handle]
def get_data_array_squad(params_dict, data_train, set_val='train'):
    """
    Pad every example to the user-defined maximum lengths and build masks.

    Examples whose paragraph is not strictly shorter than ``max_para`` are
    dropped. Questions are zero-padded up to ``max_question``; a question
    longer than that is clipped to ``max_question`` tokens so every returned
    question has the same length and a matching mask.

    Args:
        params_dict: dictionary of input parameters. Reads ``'max_para'`` and
            ``'max_question'``, plus ``'train_set_size'`` when ``set_val`` is
            ``'train'``.
        data_train: training/dev data; each example is indexed as
            ``[paragraph, question, answer]``.
        set_val: ``'train'`` caps the output at ``train_set_size`` examples;
            any other value processes the whole dataset.

    Returns:
        List of tuples ``(para_idx, question_idx, para_len, question_len,
        answer, para_mask, ques_mask)``. Masks are lists of floats holding
        1.0 on real tokens and 0.0 on padding.
    """
    max_para = params_dict['max_para']
    max_question = params_dict['max_question']
    is_train = set_val == 'train'
    train_set = []

    for ele in data_train:
        para = ele[0]
        # Over-long paragraphs are skipped rather than clipped.
        if len(para) >= max_para:
            continue

        para_idx, para_len, para_mask = _pad_with_mask(para, max_para)
        # Always derive the question length and mask from this example.
        # Leaving them unset for long questions would reuse the previous
        # example's values, or fail on the very first example.
        question_idx, question_len, ques_mask = _pad_with_mask(
            ele[1], max_question)

        train_set.append((para_idx, question_idx, para_len, question_len,
                          ele[2], para_mask, ques_mask))

        # The size limit is only looked up for the training set, and only
        # after an example has been added.
        if is_train and len(train_set) >= params_dict['train_set_size']:
            break

    return train_set


def _pad_with_mask(token_ids, max_len):
    """Clip/zero-pad ``token_ids`` to ``max_len``; return (padded, length, mask)."""
    clipped = list(token_ids[:max_len])
    length = len(clipped)
    pad_length = max_len - length
    padded = clipped + [0] * pad_length
    mask = [1.0] * length + [0.0] * pad_length
    return padded, length, mask
def create_data_dict(data):
    """
    Convert a list of example tuples into a dictionary of parallel lists.

    Args:
        data: train/dev data as an iterable of 7-tuples ordered as
            ``(para_idx, question_idx, para_len, question_len, answer,
            para_mask, ques_mask)``.

    Returns:
        Dictionary with the keys ``'para'``, ``'answer'``, ``'question'``,
        ``'question_len'``, ``'para_len'``, ``'para_mask'`` and
        ``'question_mask'``. Each maps to a list holding that field for every
        example, in input order.
    """
    field_names = ('para', 'answer', 'question', 'question_len',
                   'para_len', 'para_mask', 'question_mask')
    train = {name: [] for name in field_names}

    for (para_idx, question_idx, para_len, question_len,
         answer, para_mask, ques_mask) in data:
        train['para'].append(para_idx)
        train['question'].append(question_idx)
        train['para_len'].append(para_len)
        train['question_len'].append(question_len)
        train['answer'].append(answer)
        train['para_mask'].append(para_mask)
        train['question_mask'].append(ques_mask)

    return train